import requests
import time
import  xlwt
"""
Author:@JosonLe
at：2017/12/20
for：学习&&测试玩
Data：爬取拉勾网所有关于Python相关职业的招聘信息
这儿有三个坑:
一是Cookie，不添加的话，只能爬几页就被禁，Cookie更新的知识点等后续再学吧，这里是直接粘贴来的，下次用要重新粘贴
二是Referer，Referer在headers中起到告诉服务器链接是从哪个页面来的，一般都加上为好
三是url，URL是请求json数据的URL（network中可看），不是页面的URL
PS:excel=xlwt.Workbook()创建Excel，写入信息后一定要保存！！！
   excel.save(filename)只能是xls格式
   创建sheet表，写入内容必须是字符串
分析可得，所有关于python的工作，URL分为两种（有实习和没实习的）如下，仅仅是form data中kd参数变化了。
    'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=1'
    '''first:true,pn:1,kd:python爬虫''' '------>多了个px=default&,而且isSchoolJob=1'
    'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
    '''first:true,pn:1,kd:python后端'''
    'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
    'first:true,pn:1,kd:python数据'
    'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=1'
    '''first:true,pn:1,kd:机器学习实习'''   
"""

# Request headers replayed from a browser session. The Cookie value is a
# pasted snapshot of one session and has to be re-pasted when it stops working.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
_REFERER = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
_COOKIE = 'user_trace_token=20170825165308-d0dccc6e-8972-11e7-8ed1-5254005c3644; LGUID=20170825165308-d0dccf15-8972-11e7-8ed1-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAGGABCBFCCDE8E55C909984F59D7C9C87593DF6; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%25E5%2590%258E%25E7%25AB%25AF%3Foquery%3Dpython%26fromSearch%3Dtrue%26labelWords%3Drelative; _ga=GA1.2.1264438592.1503651188; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1513764825,1513778395,1513778395; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1513779671; LGSID=20171220215919-f94f8cae-e58d-11e7-a335-525400f775ce; LGRID=20171220222013-e50cba62-e590-11e7-a336-525400f775ce; TG-TRACK-CODE=search_code; SEARCH_ID=ac689e45d0fb451c8a57a7764c71bd2f'

header = {
    'User-Agent': _USER_AGENT,
    'Referer': _REFERER,
    'Cookie': _COOKIE,
}

# Endpoint that returns the search results as JSON (not the HTML page URL).
url = (
    'https://www.lagou.com/jobs/positionAjax.json'
    '?needAddtionalResult=false&isSchoolJob=0'
)

# (label, keys) pairs printed for every posting, in output order. A label
# followed by several keys prints all of their values on one line.
_PRINT_FIELDS = (
    ('公司:', ('companyFullName',)),
    ('行业:', ('industryField',)),
    ('学历：', ('education',)),
    ('技能要求:', ('positionLables',)),
    ('工作经验：', ('workYear',)),
    ('工作城市：', ('city',)),
    ('地点：', ('district',)),
    ('工作优势：', ('positionAdvantage',)),
    ('基本福利：', ('companyLabelList',)),
    ('薪资：', ('salary',)),
    ('岗位：', ('positionName',)),
    ('工作类型：', ('firstType', 'secondType')),
    ('是否全职：', ('jobNature',)),
)


def getLagou(url, page, keyword='python数据', timeout=10):
    """Fetch one page of search results and print a summary of each posting.

    Args:
        url: Endpoint of the position-search JSON API.
        page: Page number, sent as the ``pn`` form field.
        keyword: Search keyword, sent as the ``kd`` form field.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of posting dicts found at
        ``content -> positionResult -> result`` in the JSON response, or an
        empty list when the response does not contain that path.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    form = {
        'first': 'false',
        'pn': page,
        'kd': keyword,
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.post(url, data=form, headers=header, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    try:
        result = payload['content']['positionResult']['result']
    except (KeyError, TypeError):
        # The server answered with something other than search results
        # (e.g. an error payload); show it instead of raising a bare KeyError.
        print('第%d页未返回职位数据:' % page, payload)
        return []

    for posting in result:
        for label, keys in _PRINT_FIELDS:
            print(label, *[posting[key] for key in keys])
        print('**********')
    print('第%d页请求成功' % page)
    return result
# Write the results to Excel.
excel = xlwt.Workbook()  # create the workbook; it must be saved explicitly below
# cell_overwrite_ok=True: writing the same cell twice overwrites it.
sheet1 = excel.add_sheet('LagouWang', cell_overwrite_ok=True)

# Header row, one title per column, in the same order as the data rows below.
column_titles = [
    '公司', '行业', '学历', '技能要求', '工作经验', '工作城市', '工作地点',
    '工作优势', '基本福利', '薪资', '岗位', '工作类型', '是否全职',
]
for col, title in enumerate(column_titles):
    sheet1.write(0, col, title)

Jobs = []
for page in range(1, 31):
    try:
        page_jobs = getLagou(url, page)
    except (KeyError, ValueError, requests.RequestException) as err:
        # Stop paging but keep going to the export below, so postings that
        # were already collected are still written to the workbook.
        print('第%d页抓取失败，停止翻页：%r' % (page, err))
        break
    if not page_jobs:
        # An empty page means there is nothing more to collect.
        break
    Jobs += page_jobs
    time.sleep(2)  # pause between pages; crawling too fast does not work

for i, job in enumerate(Jobs, start=1):
    # Label lists are flattened to comma-separated text; `or []` guards
    # against a null list in the payload (assumption: the API may send null).
    positionLables = ','.join(job['positionLables'] or [])
    companyLabelList = ','.join(job['companyLabelList'] or [])
    row = [
        job['companyFullName'],
        job['industryField'],
        job['education'],
        positionLables,
        job['workYear'],
        job['city'],
        job['district'],
        job['positionAdvantage'],
        companyLabelList,
        job['salary'],
        job['positionName'],
        # Either type may be missing (null); treat it as empty text.
        (job['firstType'] or '') + (job['secondType'] or ''),
        job['jobNature'],
    ]
    for col, value in enumerate(row):
        sheet1.write(i, col, value)

excel.save('LaGouWang Position Data to dataProcessing.xls')